'''
This script reads the original training file, which is in the CSV format made for the ALTA challenge,
and groups the labels by line number, in increasing order of line number, so that the next script can
calculate the statistics for each line number.
In the output, the labels of each line number are followed by an empty line, which separates them from
the labels of the next line number.

A temporary folder called segments will be created and deleted after the process
'''
import os
import re
import shutil
import sys

oPath = "segments/"      # temporary folder holding one file per line number
oFile = "new_train.csv"  # merged output; opened in append mode


def _parse_row(line):
    '''
    Split one data row of the training file into (prediction, label, line_number).

    The prediction is column 0, the label is column 1 and the line number is
    column 3. Double quotes are stripped from the label (the study design label
    is enclosed in double quotes).
    Raises ValueError / IndexError on a malformed row.
    '''
    # NOTE(review): plain comma splitting assumes that no column before the line
    # number contains a comma itself -- confirm against the data.
    fields = line.split(",")
    return int(fields[0]), fields[1].replace('"', ''), int(fields[3])


def _split_by_line_number(in_path, segment_dir):
    '''
    Append the label of every row whose prediction is 1 to the file
    segment_dir/<line number>.

    Returns the highest line number that was written (0 if there was none).
    '''
    highest = 0
    with open(in_path, "r") as fin:
        next(fin, None)  # First line is heading, must be ignored
        for line in fin:
            if not line.strip():
                continue  # tolerate blank lines instead of crashing on them
            prediction, label, line_number = _parse_row(line)
            if prediction != 1:
                continue
            with open(os.path.join(segment_dir, str(line_number)), "a") as segment:
                segment.write(label + "\n")
            highest = max(highest, line_number)
    return highest


def _merge_segments(segment_dir, out_path, highest):
    '''
    Append the segment files 1..highest to out_path, each followed by an empty line.

    A line number without a segment file still contributes its empty line, so
    the n-th group of the output always belongs to line number n. Segment files
    for line numbers below 1 are not emitted.
    '''
    with open(out_path, "a") as out:
        for number in range(1, highest + 1):
            segment_path = os.path.join(segment_dir, str(number))
            # Iterating up to the highest line number (rather than up to the
            # number of files) keeps the groups after a gap from being dropped.
            if os.path.exists(segment_path):
                with open(segment_path, "r") as segment:
                    shutil.copyfileobj(segment, out)
            out.write("\n")


def main(argv):
    '''
    Run the script with the given argument vector (argv[0] is the script name,
    argv[1] the training file).

    Returns the process exit status: 0 on success, 1 when the arguments are
    wrong or the temporary folder already exists.
    '''
    if len(argv) != 2:
        print("usage: python create_training training_file")
        return 1

    if os.path.exists(oPath):
        print("Please remove the folder 'segments' from your current directory and re-run the script")
        return 1
    os.makedirs(oPath)

    try:
        highest = _split_by_line_number(argv[1], oPath)
        _merge_segments(oPath, oFile, highest)
    finally:
        # Always remove the temporary folder, even after a failure, so that the
        # next run is not refused because of left-overs.
        shutil.rmtree(oPath)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
